suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Settings
# Root of the sequencing data volume and the analysis directory that every
# output path below is derived from.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"

# Output locations for figures and tables. They are assembled with paste0(),
# so the trailing slash of each component is part of the path.
figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Metagene/')
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/')

# Make the analysis directory the working directory for the rest of the script.
setwd(wd)

# Global ggplot2 defaults: classic theme, 7 pt base font, legend below the panel.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))
# Functions
#' Add a `genetype2` column that refines `gene_type` for protein-coding genes.
#'
#' Protein-coding rows are labelled 'mt-mRNA' when `seqname` is 'chrM' and
#' 'mRNA' when `seqname` is anything else. Every other row, including rows
#' where either comparison evaluates to NA, keeps its `gene_type` value.
#'
#' @param df Data frame with `gene_type` and `seqname` columns.
#' @return `df` with the `genetype2` column added.
add_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = case_when(
      # The two conditions are mutually exclusive, so their order is irrelevant.
      seqname != 'chrM' & gene_type == 'protein_coding' ~ 'mRNA',
      seqname == 'chrM' & gene_type == 'protein_coding' ~ 'mt-mRNA',
      .default = gene_type
    )
  )
}
#' Express k-mer coordinates as fractions of the transcript length.
#'
#' Looks up each row's transcript length and divides `kmer_start`,
#' `kmer_middle` and `kmer_end` by it.
#'
#' @param df Data frame with `transcript_id`, `kmer_start`, `kmer_middle` and
#'   `kmer_end` columns.
#' @param transcript_lengths Lookup table with `transcript_id` and `length`
#'   columns. Defaults to the script-level object
#'   `espresso_AsPC1_transcript_seq_length`, which is resolved lazily at call
#'   time, so existing one-argument calls keep working.
#' @return `df` with `length`, `rel_kmer_start`, `rel_kmer_middle` and
#'   `rel_kmer_end` added. Rows whose transcript is absent from the lookup
#'   table get NA in all four columns.
calc_kmer_relative_position <- function(
    df,
    transcript_lengths = espresso_AsPC1_transcript_seq_length
) {
  # Only the key and the length are needed; any other column (e.g. the
  # sequence itself) is left out of the join.
  length_lookup <- transcript_lengths |>
    select(transcript_id, length)

  df |>
    left_join(
      length_lookup,
      # Explicit key: no reliance on whichever column names happen to be shared.
      by = 'transcript_id',
      # Each site must match at most one transcript; a duplicated transcript_id
      # in the lookup table raises an error instead of multiplying site rows.
      relationship = 'many-to-one'
    ) |>
    mutate(
      rel_kmer_start = kmer_start / length,
      rel_kmer_middle = kmer_middle / length,
      rel_kmer_end = kmer_end / length
    )
}
# Read data
# Load the gzipped TSV of DRS positions stored under the analysis directory.
DRS_methylated_positions <-
  wd |>
  paste0('Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-10.tsv.gz') |>
  read_tsv()
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the loaded table.
DRS_methylated_positions
## # A tibble: 605 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 43 ACACA 1
## 5 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 6 ENST00000389680.2 MT-RNR1-201 71 GTTCA 1
## 7 ENST00000389680.2 MT-RNR1-201 73 TCACC 1
## 8 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 9 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 10 ENST00000389680.2 MT-RNR1-201 138 GCTTA 1
## # ℹ 595 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Transcript table of the custom Espresso AsPC1 annotation. The file has no
# header row, so the three column names are supplied here. The path is built
# from `data_dir` so the data-volume root is defined in one place only.
espresso_AsPC1_transcript_seq_length <-
  read_tsv(
    paste0(data_dir, 'Database/Custom/Espresso_AsPC1/Espresso_AsPC1.transcripts.tsv'),
    col_names = c('transcript_id', 'seq', 'length')
  )
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, seq
## dbl (1): length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write the transcript table (sequences included) out with export_tsv().
# No outdir is passed for this export.
espresso_AsPC1_transcript_seq_length |>
export_tsv()
##
## Exported to: Tables/espresso_AsPC1_transcript_seq_length_2024-04-15.tsv
## # A tibble: 36,717 × 3
## transcript_id seq length
## <chr> <chr> <dbl>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCC… 987
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCC… 2252
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTG… 854
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAA… 6597
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACC… 5500
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGG… 4528
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTG… 2038
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCA… 2187
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGAC… 2203
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCT… 723
## # ℹ 36,707 more rows
# Calculate range of kmer
# Derive k-mer start/middle/end coordinates for every site, keep the
# identifying columns, then add `genetype2` and the transcript-relative
# positions.
# NOTE(review): presumably `position` is the 0-based index of the k-mer's first
# base, making start..end a 1-based 5-nt window — confirm against the upstream
# tool.
DRS_methylated_positions_relative_range <-
DRS_methylated_positions |>
mutate(
kmer_start = position + 1, kmer_end = position + 5,
kmer_middle = position + 3
) |>
# contains('kmer') keeps `ref_kmer` as well as the three columns created above.
select(transcript_id, gene_name, seqname, gene_type, contains('kmer')) |>
add_genetype2() |>
calc_kmer_relative_position()
## Joining with `by = join_by(transcript_id)`
# Save the per-site relative positions under `tabledir`.
DRS_methylated_positions_relative_range |>
export_tsv(outdir = tabledir)
##
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-15.tsv
## # A tibble: 605 × 13
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACACA 44 48
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GTTCA 72 76
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA TCACC 74 78
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCTTA 139 143
## # ℹ 595 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
# List the gene-type labels present. NA shows up for rows whose `gene_type` is
# NA, because add_genetype2() passes `gene_type` through by default.
DRS_methylated_positions_relative_range$genetype2 |> unique()
## [1] "mRNA" "Mt_rRNA" "mt-mRNA" NA
# Plot
#' Histogram of relative site positions for one gene type.
#'
#' Filters the script-level table `DRS_methylated_positions_relative_range` to
#' rows whose `genetype2` equals `genetype`, draws a 50-bin histogram of
#' `rel_kmer_middle`, and hands the plot to `ggsave_multiple_formats()`, which
#' writes it to `figdir` as 'DRS_m3Csites_distribution_<genetype>'.
#'
#' @param genetype A single `genetype2` label, e.g. 'mRNA'.
#' @return Whatever `ggsave_multiple_formats()` returns.
plot_distribution_DRS_relpositions_each_genetype2 <- function(genetype) {
  sites_of_type <- filter(
    DRS_methylated_positions_relative_range,
    genetype2 == genetype
  )

  distribution_plot <-
    ggplot(sites_of_type, aes(x = rel_kmer_middle)) +
    geom_histogram(bins = 50)

  ggsave_multiple_formats(
    distribution_plot,
    width = 4, height = 2.5, fontsize = 7,
    basename = paste0('DRS_m3Csites_distribution_', genetype),
    outdir = figdir
  )
}
# Histogram of relative site positions for each gene type; every call also
# saves the figure through ggsave_multiple_formats().
plot_distribution_DRS_relpositions_each_genetype2('Mt_rRNA')

plot_distribution_DRS_relpositions_each_genetype2('mt-mRNA')

plot_distribution_DRS_relpositions_each_genetype2('mRNA')

#' Density curve of relative site positions for one gene type.
#'
#' Filters the script-level table `DRS_methylated_positions_relative_range` to
#' rows whose `genetype2` equals `genetype`, draws the kernel density of
#' `rel_kmer_middle`, prints the plot, and hands it to
#' `ggsave_multiple_formats()`, which writes it to `figdir` as
#' 'DRS_m3Csites_density_<genetype>'.
#'
#' @param genetype A single `genetype2` label, e.g. 'mRNA'.
#' @return Whatever `ggsave_multiple_formats()` returns.
plot_density_DRS_relpositions_each_genetype2 <- function(genetype) {
  sites_of_type <- filter(
    DRS_methylated_positions_relative_range,
    genetype2 == genetype
  )

  distribution_plot <-
    ggplot(sites_of_type, aes(x = rel_kmer_middle)) +
    geom_density()

  # Printed explicitly so the figure is shown even when the caller discards
  # the return value (e.g. when invoked through walk()).
  print(distribution_plot)

  ggsave_multiple_formats(
    distribution_plot,
    width = 4, height = 2.5, fontsize = 7,
    basename = paste0('DRS_m3Csites_density_', genetype),
    outdir = figdir
  )
}
# Draw and save the density plot for each gene type in turn.
walk(
  c('Mt_rRNA', 'mt-mRNA', 'mRNA'),
  plot_density_DRS_relpositions_each_genetype2
)



# One histogram panel per gene type, stacked in a single column with
# independent y-axes. Rows without a `genetype2` label are dropped first.
distribution_plot_groupedby_genetype2 <-
  ggplot(
    filter(DRS_methylated_positions_relative_range, !is.na(genetype2)),
    aes(x = rel_kmer_middle)
  ) +
  geom_histogram(bins = 50) +
  facet_wrap(vars(genetype2), scales = 'free_y', ncol = 1)

# No basename is passed, so the object name above is left unchanged in case
# the helper derives the file name from it.
ggsave_multiple_formats(
  distribution_plot_groupedby_genetype2,
  width = 3.5, height = 7, outdir = figdir
)
